require(knitr)
## Loading required package: knitr
knitr::opts_chunk$set(echo = TRUE,warning = FALSE, message = FALSE)
knitr::opts_chunk$set(fig.width = 15, fig.height = 10)
knitr::opts_chunk$set(comment = NA)
# Change this working directory to be your location of the session 7 folder
knitr::opts_knit$set(root.dir="~/Desktop/code_club_curriculum/Session_7/")
Note: This code chunk loads the required packages for this code club session. Instead of using the library function consecutively for each package, I often use this lapply function strategy to load my packages at the start of my R markdown file.
# Packages used
# Cran packages
cran_packages <- c("BiocManager","phytools","ggplot2","tidyverse","readxl","randomcoloR","cowplot","printr")
# BiocManager packages
BiocManager_packages <- c("ggtree","ggnewscale")
# Functions to install packages (if not installed already)
# lapply(cran_packages,install.packages,character.only=T)
# lapply(BiocManager_packages,BiocManager::install,character.only=T)
# Functions to load packages
lapply(cran_packages,library,character.only=T)
[[1]]
[1] "BiocManager" "knitr" "stats" "graphics" "grDevices"
[6] "utils" "datasets" "methods" "base"
[[2]]
[1] "phytools" "maps" "ape" "BiocManager" "knitr"
[6] "stats" "graphics" "grDevices" "utils" "datasets"
[11] "methods" "base"
[[3]]
[1] "ggplot2" "phytools" "maps" "ape" "BiocManager"
[6] "knitr" "stats" "graphics" "grDevices" "utils"
[11] "datasets" "methods" "base"
[[4]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "tidyverse" "ggplot2" "phytools"
[11] "maps" "ape" "BiocManager" "knitr" "stats"
[16] "graphics" "grDevices" "utils" "datasets" "methods"
[21] "base"
[[5]]
[1] "readxl" "forcats" "stringr" "dplyr" "purrr"
[6] "readr" "tidyr" "tibble" "tidyverse" "ggplot2"
[11] "phytools" "maps" "ape" "BiocManager" "knitr"
[16] "stats" "graphics" "grDevices" "utils" "datasets"
[21] "methods" "base"
[[6]]
[1] "randomcoloR" "readxl" "forcats" "stringr" "dplyr"
[6] "purrr" "readr" "tidyr" "tibble" "tidyverse"
[11] "ggplot2" "phytools" "maps" "ape" "BiocManager"
[16] "knitr" "stats" "graphics" "grDevices" "utils"
[21] "datasets" "methods" "base"
[[7]]
[1] "cowplot" "randomcoloR" "readxl" "forcats" "stringr"
[6] "dplyr" "purrr" "readr" "tidyr" "tibble"
[11] "tidyverse" "ggplot2" "phytools" "maps" "ape"
[16] "BiocManager" "knitr" "stats" "graphics" "grDevices"
[21] "utils" "datasets" "methods" "base"
[[8]]
[1] "printr" "cowplot" "randomcoloR" "readxl" "forcats"
[6] "stringr" "dplyr" "purrr" "readr" "tidyr"
[11] "tibble" "tidyverse" "ggplot2" "phytools" "maps"
[16] "ape" "BiocManager" "knitr" "stats" "graphics"
[21] "grDevices" "utils" "datasets" "methods" "base"
lapply(BiocManager_packages,library,character.only=T)
[[1]]
[1] "ggtree" "printr" "cowplot" "randomcoloR" "readxl"
[6] "forcats" "stringr" "dplyr" "purrr" "readr"
[11] "tidyr" "tibble" "tidyverse" "ggplot2" "phytools"
[16] "maps" "ape" "BiocManager" "knitr" "stats"
[21] "graphics" "grDevices" "utils" "datasets" "methods"
[26] "base"
[[2]]
[1] "ggnewscale" "ggtree" "printr" "cowplot" "randomcoloR"
[6] "readxl" "forcats" "stringr" "dplyr" "purrr"
[11] "readr" "tidyr" "tibble" "tidyverse" "ggplot2"
[16] "phytools" "maps" "ape" "BiocManager" "knitr"
[21] "stats" "graphics" "grDevices" "utils" "datasets"
[26] "methods" "base"
# Note: If you choose to work in the console and not knit your .Rmd, set your working directory to YOUR location of the session 7 folder
setwd("~/Desktop/code_club_curriculum/Session_7/")
# Load .treefile using the ape package ()
tree <- read.tree(file="./2021_02_12_08_34_28_KPNIH1_genome_aln_w_alt_allele_unmapped.treefile")
# Question: What type of object is the .treefile?
# Answer: A list of the class "phylo."
class(tree)
[1] "phylo"
# Visualize tree object using the ggtree function
# Help page documentation for ggtree function
help(ggtree)
| ggtree | R Documentation |
ggtree provides functions for visualizing phylogenetic tree and its associated data in R.
If you use ggtree in published research, please cite: Guangchuang Yu, David Smith, Huachen Zhu, Yi Guan, Tommy Tsan-Yuk Lam. ggtree: an R package for visualization and annotation of phylogenetic trees with their covariates and other associated data. Methods in Ecology and Evolution 2017, 8(1):28-36, doi:10.1111/2041-210X.12628
drawing phylogenetic tree from phylo object
ggtree( tr, mapping = NULL, layout = "rectangular", open.angle = 0, mrsd = NULL, as.Date = FALSE, yscale = "none", yscale_mapping = NULL, ladderize = TRUE, right = FALSE, branch.length = "branch.length", root.position = 0, xlim = NULL, ... )
tr
|
phylo object |
mapping
|
aesthetic mapping |
layout
|
one of ‘rectangular’, ‘dendrogram’, ‘slanted’, ‘ellipse’, ‘roundrect’, ‘fan’, ‘circular’, ‘inward_circular’, ‘radial’, ‘equal_angle’, ‘daylight’ or ‘ape’ |
open.angle
|
open angle, only for ‘fan’ layout |
mrsd
|
most recent sampling date |
as.Date
|
logical whether using Date class in time tree |
yscale
|
y scale |
yscale_mapping
|
yscale mapping for category variable |
ladderize
|
logical (default |
right
|
logical. If |
branch.length
|
variable for scaling branch, if ‘none’ draw cladogram |
root.position
|
position of the root node (default = 0) |
xlim
|
x limits, only works for ‘inward_circular’ layout |
…
|
additional parameter some dot arguments:
|
tree
Yu Guangchuang
G Yu, TTY Lam, H Zhu, Y Guan (2018). Two methods for mapping and visualizing associated data on phylogeny using ggtree. Molecular Biology and Evolution, 35(2):3041-3043. https://doi.org/10.1093/molbev/msy194
G Yu, DK Smith, H Zhu, Y Guan, TTY Lam (2017). ggtree: an R package for visualization and annotation of phylogenetic trees with their covariates and other associated data. Methods in Ecology and Evolution, 8(1):28-36. https://doi.org/10.1111/2041-210X.12628
geom_tree()
require(ape) tr <- rtree(10) ggtree(tr)
# Use the ggtree function to view phylogeny
ggtree(tree)
# Exercise: Add a (1) treescale using geom_treescale and (2) overlay a title using a ggplot2 functions
ggtree(tree) + geom_treescale(x=0,y=-10,offset=1) + ggtitle("Initial Phylogeny")
# Question: After adding the scale to the phylogeny, what are your takeaways? Does this tell you anything about the genomic diversity of the isolates used in the tree?
# Answer: Notice the differential branch lengths in the phylogeny, specifically that there are ~ 400 isolates with a smaller distance between edges.
# Exercise: Identify the location of the tip labels for this tree
head(tree$tip.label)
[1] "gi_661922017_gb_CP008827.1_" "PCMP_H100"
[3] "PCMP_H98" "PCMP_H123"
[5] "PCMP_H141" "PCMP_H376"
# Drop Reference group using the treeio package's drop.tip function
tree_wo_root <- treeio::drop.tip(tree,'gi_661922017_gb_CP008827.1_')
# View tree without reference group
ggtree(tree_wo_root) + ggtitle("Tree Without the Reference Group")
# Question: Would midpoint rooting be valuable for this phylogeny?
#Answer: Midpoint root a phylogeny involves locating the longest path between any two tips and putting the root at that location
# Midpoint root using phytools rooting tool
tree_final <- phytools::midpoint.root(tree_wo_root)
# View midpoint rooted tree again
ggtree(tree_final) + ggtitle("Midpoint Rooted Tree")
# Question: What is the difference in the tree_wo_root and tree_final?
# Plot the ggtree images side-by-side using the cowplot package's plot_grid function
plot_grid(ggtree(tree_wo_root), ggtree(tree_final), ncol=2,labels = c('Without Midpoint Rooting','Midpoint Root'))
# Import CRKP Metadata
CRKP_metadata <- as.data.frame(read_xlsx('./CRKP_metadata.xlsx'))
# Exercise: Characterize the dataset
#1. What are the variables in this dataframe?
# Answer: isolate_no, LTACH, Region, Source, ST (Sequence Type), and Colistin (Susceptibility)
names(CRKP_metadata)
[1] "isolate_no" "LTACH" "Region" "Source" "ST"
[6] "Colistin"
#2. How many isolates are in this dataframe?
# Hint: Each row = a unique isolate
# Answer: 458 isolates
nrow(CRKP_metadata)
[1] 458
#3. Does the number of isolates in the metadata dataframe equal the number of isolates in the phylogeny?
# Answer: No. Not all isolates were sequenced and included in the phylogeny.
nrow(CRKP_metadata) == tree$Nnode
[1] FALSE
#4. Does the row names of the dataframe match the tip labels in the tree?
# Answer: No. We must ensure that row names of the dataframe match the tip labels of the phylogeny to ensure proper mapping of the data to the phylogeny.
head(rownames(CRKP_metadata))
[1] "1" "2" "3" "4" "5" "6"
head(tree_final$tip.label)
[1] "PCMP_H92" "PCMP_H366" "PCMP_H100" "PCMP_H98" "PCMP_H123" "PCMP_H141"
# Create rownames for sorting the dataframe to match the tiplabels of the tree
rownames(CRKP_metadata) <- paste0("PCMP_H",CRKP_metadata$isolate_no)
# Subset and sort the dataframe using the match function
CRKP_metadata_final <- as.data.frame(CRKP_metadata[match(as.vector(tree_final$tip.label), row.names(CRKP_metadata)), ])
# Question: What is the distribution of sequence types within the dataset?
# Data table:
table(CRKP_metadata_final$ST)
| 107 | 15 | 15* | 17 | 1832 | 193 | 2237 | 258 | 307 | 34 | 345 | 36 | 418 | ND | Novel* |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 10 | 2 | 2 | 1 | 1 | 1 | 411 | 12 | 1 | 1 | 2 | 1 | 6 | 1 |
# Bar plot
ggplot(data = CRKP_metadata_final, aes(x = ST)) +
geom_bar()
# Add tips to the tree to visualize distribution of sequence types across the phylogeny
# Create tip for sequence type
ST_tip <- c(CRKP_metadata_final$ST,rep(NA,tree_final$Nnode))
# Visualize tree with sequence type as tips
ggtree(tree_final) + geom_tippoint(aes(color=ST_tip,legend_title="Sequence Types"))
# Question: How is sequence type distributed across the phylogeny?
# Answer: The clade with a relatively lower distances between tips are predominantly ST258 isolates
# Subset the tree to include only ST258 isolates
# Create ST258 dataframe
CRKP_258 <- subset(CRKP_metadata_final,ST == "258")
# Subset phylogeny to include only ST258 isolates
tree_258 <- drop.tip(tree_final,tree_final$tip.label[!tree_final$tip.label %in% rownames(CRKP_258)])
# Sort dataframe by tip label
CRKP_258_sorted <- CRKP_258[match(tree_258$tip.label, rownames(CRKP_258)), ]
# Visualize ST258 phylogeny
ggtree(tree_258)
# Visualize ST258 phylogeny using different layouts
# Default (layout = "rectangular")
ggtree(tree_258)
# Circular
ggtree(tree_258,layout = "circular")
# Slanted
ggtree(tree_258,layout = "slanted")
# Dendogran
ggtree(tree_258,layout = "dendrogram")
# Ape
ggtree(tree_258,layout = "ape")
# No Branch Length
ggtree(tree_258,layout="rectangular",branch.length = 'none')
# Exercise 1: Create a tip label for Source
# 1. Tabulate source data
table(CRKP_258_sorted$Source)
| Blood | Respiratory | Urine | Wound |
|---|---|---|---|
| 34 | 231 | 140 | 6 |
# 2. Create tip for Source
Source_tip <- c(CRKP_258_sorted$Source,rep(NA,tree_258$Nnode))
# 3. Visualize
ggtree(tree_258) + geom_tippoint(aes(color=Source_tip),size=2) + labs(color="Source")
# 4. Store as tree_258_source
tree_258_source <- ggtree(tree_258) + geom_tippoint(aes(color=Source_tip),size=2) + labs(color="Source")
# Exercise 2: Create a tip label for Colistin Resistance
# 1. How many isolates are susceptible to colistin?
# Answer: 275 (66.9%)
table(CRKP_258_sorted$Colistin)
| Non-Susceptible | Susceptible |
|---|---|
| 136 | 275 |
# 2. Create tip for colistin resistance
Colistin <- c(CRKP_258_sorted$Colistin,rep(NA,tree_258$Nnode))
# 3. Visualize and improve size
ggtree(tree_258) + geom_tippoint(aes(color=Colistin,legend_title="Colistin Resistance"),size=2)
# Exercise 3: Create a phylogeny with geom_tippoint (1) color = source and (2) shape = colistin resistance
ggtree(tree_258) + geom_tippoint(aes(color=Source_tip,shape=Colistin,legend_title="Region"),size=2)+ labs(color="Source")
# gheatmap Help Page
help(gheatmap)
| gheatmap | R Documentation |
append a heatmap of a matrix to right side of phylogenetic tree
gheatmap( p, data, offset = 0, width = 1, low = "green", high = "red", color = "white", colnames = TRUE, colnames_position = "bottom", colnames_angle = 0, colnames_level = NULL, colnames_offset_x = 0, colnames_offset_y = 0, font.size = 4, family = "", hjust = 0.5, legend_title = "value" )
p
|
tree view |
data
|
matrix or data.frame |
offset
|
offset of heatmap to tree |
width
|
total width of heatmap, compare to width of tree |
low
|
color of lowest value |
high
|
color of highest value |
color
|
color of heatmap cell border |
colnames
|
logical, add matrix colnames or not |
colnames_position
|
one of ‘bottom’ or ‘top’ |
colnames_angle
|
angle of column names |
colnames_level
|
levels of colnames |
colnames_offset_x
|
x offset for column names |
colnames_offset_y
|
y offset for column names |
font.size
|
font size of matrix colnames |
family
|
font of matrix colnames |
hjust
|
hjust for column names (0: align left, 0.5: align center, 1: align righ) |
legend_title
|
title of fill legend |
tree view
Guangchuang Yu
# Initial gheatmap of Colistin Resistance (without formatting)
gheatmap(ggtree(tree_258),CRKP_258_sorted %>% select(Colistin))+ ylim(NA, 450)
# Stepwise formatting of gheatmap
# Change Heatmap Structure: Angle, Offset, Width, Justification, and Font Size
gheatmap(ggtree(tree_258), CRKP_258_sorted %>% select(Colistin), colnames_position="top", colnames_angle=90, colnames_offset_y=0.25, width=.05,hjust=0,font.size=5) + ylim(NA, 450)
# Change Heatmap Fill Color and Legend Name: scale_fill_manual
gheatmap(ggtree(tree_258), CRKP_258_sorted %>% select(Colistin), colnames_position="top", colnames_angle=90, colnames_offset_y=0.25, width=.05,hjust=0,font.size=5) + scale_fill_manual(values=c("black","white"),name="Resistance Profile") + ylim(NA, 450)
# Change Legend Information: Theme
gheatmap(ggtree(tree_258), CRKP_258_sorted %>% select(Colistin), colnames_position="top", colnames_angle=90, colnames_offset_y=0.25, width=.05,hjust=0,font.size=5) + scale_fill_manual(values=c("black","white"),name="Resistance Profile") + theme(legend.position ='bottom',legend.key = element_rect(colour=c('black')), legend.direction="horizontal",legend.title = element_text(size=14),legend.text = element_text(size=8)) + ylim(NA, 475)
# Exersise: Add stored ST258 phylogeny that has source tips to the gheatmap function created above
# Hint: Format of gheatmap function = gheatmap(phylogeny, data, options)
gheatmap(tree_258_source, CRKP_258_sorted %>% select(Colistin), colnames_position="top", colnames_angle=90, colnames_offset_y=0.25, width=.05,hjust=0,font.size=5) + scale_fill_manual(values=c("black","white"),name="Resistance Profile") + theme(legend.position ='bottom',legend.key = element_rect(colour=c('black')), legend.direction="horizontal",legend.title = element_text(size=14),legend.text = element_text(size=8)) + ylim(NA,475)
# Exercise: Plot source and resistance (Colistin) in the same gheatmap
gheatmap(ggtree(tree_258),CRKP_258_sorted %>% select(Source,Colistin))+ ylim(NA, 450)
# Question: What is 'wrong' about the figure above? How can you fix it?
# Answer: Note the issue with the legend. Each variable has different values. Thus, they should have different scales!
#Introducing: ggnewscale's new_scale_fill function!
help(new_scale_fill)
| new_scale | R Documentation |
Creates a new scale "slot". Geoms added to a plot after this function will use a new scale definition.
new_scale(new_aes) new_scale_fill() new_scale_color() new_scale_colour()
new_aes
|
A string with the name of the aesthetic for which a new scale will be created. |
new_scale_color(), new_scale_colour() and new_scale_fill() are just aliases to new_scale(“color”), etc…
library(ggplot2)
# Equivalent to melt(volcano), but we don't want to depend on reshape2
topography <- expand.grid(x = 1:nrow(volcano),
y = 1:ncol(volcano))
topography$z <- c(volcano)
# point measurements of something at a few locations
measurements <- data.frame(x = runif(30, 1, 80),
y = runif(30, 1, 60),
thing = rnorm(30))
ggplot(mapping = aes(x, y)) +
geom_contour(data = topography, aes(z = z, color = stat(level))) +
# Color scale for topography
scale_color_viridis_c(option = "D") +
# geoms below will use another color scale
new_scale_color() +
geom_point(data = measurements, size = 3, aes(color = thing)) +
# Color scale applied to geoms added after new_scale_color()
scale_color_viridis_c(option = "A")
# Generate of Unique Colors
# Generate Distinct Color Palette Using the distinctColorPallete() function
# Note: You can also use the randomColor function to generate a vector of random colors.
palette_source <- distinctColorPalette(4)
# View Color Palette Chosen
pie(rep(1,length(palette_source)),col=palette_source)
# Generate Heatmap Using ggnewscale
# Phylogeny with Source Heatmap Column
p1 <- gheatmap(ggtree(tree_258),CRKP_258_sorted %>% select(Source),colnames_position="top", colnames_angle=90, colnames_offset_y=0.25, width=.05,hjust=0,font.size=6) + scale_fill_manual(values=palette_source,name="Source") + ylim(NA,500)
print(p1)
#Use new_scale_fill to add optionc
p2 <- p1 + new_scale_fill()
#Add Resistance Column to Heatmap
p3 <- gheatmap(p2, CRKP_258_sorted %>% select(Colistin), colnames_position="top", colnames_angle=90, colnames_offset_y=0.25, width=.05, hjust=0, font.size=6, offset=.0000018) + theme(legend.position ='bottom',legend.key = element_rect(colour=c('black')), legend.direction="horizontal",legend.title = element_text(size=14),legend.text = element_text(size=8))+ geom_treescale(x=0,y=-15,offset=1) + scale_fill_manual(values=c("black","white"),name="Resistance Profile") + ylim(NA, 500)
print(p3)